import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import datetime
import ipywidgets as widgets
from IPython.display import display, Javascript
New York is the city that never sleeps and it is often known as the busiest city in world. The taxis in New York are the lifeblood of the city. There are over 13,000 active yellow cabs in NYC and they do millions of trips in a month. We decided to analyze this data. The following analysis is about the yellow taxi trips in New York for one month.

The data was in a parquet file, so we read it in a dataframe and converted it to a csv format

parquet_file = 'https://github.com/pegasus-99/NYC_yellowcab_data_visualization/blob/main/yellow_tripdata.parquet?raw=true'
df = pd.read_parquet(parquet_file, engine='auto')
df.to_csv('yellow_tripdata.csv')
df.head()
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount congestion_surcharge airport_fee
0 1 2022-08-01 00:17:39 2022-08-01 00:19:58 1.0 0.4 1.0 N 114 148 1 3.5 3.0 0.5 1.80 0.00 0.3 9.10 2.5 0.0
1 1 2022-08-01 00:26:06 2022-08-01 00:31:55 1.0 1.4 1.0 N 79 137 1 6.5 3.0 0.5 2.05 0.00 0.3 12.35 2.5 0.0
2 1 2022-08-01 00:45:49 2022-08-01 00:59:29 1.0 5.2 1.0 N 79 74 1 16.0 3.0 0.5 3.95 0.00 0.3 23.75 2.5 0.0
3 1 2022-08-01 00:05:49 2022-08-01 00:25:42 1.0 9.4 1.0 N 138 113 1 28.0 3.0 0.5 7.65 6.55 0.3 46.00 2.5 0.0
4 1 2022-08-01 00:36:29 2022-08-01 00:51:29 1.0 1.7 1.0 N 137 68 1 11.0 3.0 0.5 1.00 0.00 0.3 15.80 2.5 0.0
# Converting pickup and dropoff columns from object to datetime
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
df.isnull().sum()
VendorID                     0
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          93174
trip_distance                0
RatecodeID               93174
store_and_fwd_flag       93174
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge     93174
airport_fee              93174
dtype: int64

Out of 3.1 million rows, only around 93,000 of them seem to have null values. Which is not a very significant amount

# Creating columns for pickup date and pickup time from the datetime format, same for dropoff 
df['pickup_date'] =  [d.date() for d in df['tpep_pickup_datetime']]
df['pickup_time'] =  [d.time() for d in df['tpep_pickup_datetime']]
df['dropoff_date'] = [d.date() for d in df['tpep_dropoff_datetime']]
df['dropoff_time'] = [d.time() for d in df['tpep_dropoff_datetime']]
df.dtypes
VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
pickup_date                      object
pickup_time                      object
dropoff_date                     object
dropoff_time                     object
dtype: object
# Converting pickup date and dropoff date into datetime format
df['pickup_date'] = pd.to_datetime(df['pickup_date'])
df['dropoff_date'] = pd.to_datetime(df['dropoff_date'])
# Grouping by pickup date and location and getting the sum of all the values relating to the fare
df_group_date = df[['fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount','pickup_date','PULocationID']].groupby(['pickup_date', 'PULocationID']).sum()
# Resetting the index to get values of the sum for each location and date
df_group_date.reset_index(level=[0,1], inplace=True)

Next I created a dates and locations variable which had a list of all the unique pickup dates and pickup locations respectively

dates = list(df['pickup_date'].unique())
locations = list(df['PULocationID'].unique())

Here we created an interactive function that allows the user to pick the date and location to view the total amount of money earned from pickups in the chosen area at the chosen date. There is also a breakdown of the total cost into its many components

@widgets.interact(pickup_date=dates, PULocationID = locations)
def date_location_select(pickup_date = 'pickup_date', PULocationID ='PULocationID'):
    z = df_group_date[(df_group_date['pickup_date']==pickup_date) & (df_group_date['PULocationID'] == PULocationID)]
    z = z.drop(['pickup_date', 'PULocationID'], axis=1)
    if z.empty:
        return False
    ax = z.T.plot(kind="bar", rot=0, title='Breakdown of toal amount paid by day and location', figsize=(10,6))
    ax.set_xticklabels(list(z.columns), rotation = 90)
    ax.set_xlabel("Fare breakdown")
    ax.set_ylabel("Total cost for location")
    ax.get_legend().remove()
{"version_major":2,"version_minor":0,"model_id":"122e2d2248284e408b5cc3db39030ce8"}
data = df.copy()
# Checking statistial values of various columns
pd.set_option('display.float_format', lambda x: '%.4f' % x)
data.describe()
VendorID passenger_count trip_distance RatecodeID PULocationID DOLocationID payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount congestion_surcharge airport_fee
count 3152677.0000 3059503.0000 3152677.0000 3059503.0000 3152677.0000 3152677.0000 3152677.0000 3152677.0000 3152677.0000 3152677.0000 3152677.0000 3152677.0000 3152677.0000 3152677.0000 3059503.0000 3059503.0000
mean 1.7249 1.4231 5.7163 1.4722 163.3933 160.9009 1.2096 14.8191 0.9886 0.4875 2.6663 0.5629 0.2956 21.5539 2.2564 0.1087
std 0.4730 0.9814 565.1028 6.1248 64.8108 70.1000 0.5158 14.8066 1.2382 0.0979 3.2889 2.0721 0.0509 18.2757 0.7893 0.3564
min 1.0000 0.0000 0.0000 1.0000 1.0000 1.0000 0.0000 -900.0000 -5.0000 -0.5000 -155.0000 -54.0000 -0.3000 -895.3000 -2.5000 -1.2500
25% 1.0000 1.0000 1.1300 1.0000 132.0000 112.0000 1.0000 7.0000 0.0000 0.5000 0.0000 0.0000 0.3000 12.3000 2.5000 0.0000
50% 2.0000 1.0000 1.9300 1.0000 161.0000 162.0000 1.0000 10.0000 0.5000 0.5000 2.0600 0.0000 0.3000 15.8700 2.5000 0.0000
75% 2.0000 2.0000 3.7000 1.0000 233.0000 233.0000 1.0000 16.5000 2.5000 0.5000 3.2600 0.0000 0.3000 22.8000 2.5000 0.0000
max 6.0000 9.0000 330578.6600 99.0000 265.0000 265.0000 5.0000 1136.5000 9.0500 3.3000 400.0000 139.4500 0.3000 1139.8000 2.7500 1.2500
# Converting columns to datetime
data['tpep_pickup_datetime']=pd.to_datetime(data['tpep_pickup_datetime'])
data['tpep_dropoff_datetime']=pd.to_datetime(data['tpep_dropoff_datetime'])
# Creating columns for day, weekday, hour and month for pickup and dropoff
data['pickup_day']=data['tpep_pickup_datetime'].dt.day_name()
data['dropoff_day']=data['tpep_dropoff_datetime'].dt.day_name()
data['pickup_weekday']=data['tpep_pickup_datetime'].dt.weekday
data['dropoff_weekday']=data['tpep_dropoff_datetime'].dt.weekday
data['pickup_hour']=data['tpep_pickup_datetime'].dt.hour
data['dropoff_hour']=data['tpep_dropoff_datetime'].dt.hour
data['pickup_month']=data['tpep_pickup_datetime'].dt.month
data['dropoff_month']=data['tpep_dropoff_datetime'].dt.month
# Creating a column to calculate the tip percent
data["tip_percent"] = data["tip_amount"] * 100 / (data["tip_amount"] + data["fare_amount"])
# Plotting average tip percent per day
plt.ticklabel_format(style='plain')    # to prevent scientific notation.
n_by_pickup_percent = data.groupby("pickup_weekday")["tip_percent"].mean()
ax = n_by_pickup_percent.sort_values(ascending = False).plot.bar()
plt.title('Tip Percent Average per day')
plt.xlabel('Pickup Day of week')
plt.ylabel('Tip Percent')
plt.show()

# Average tip percent per hour of the day
plt.ticklabel_format(style='plain')    # to prevent scientific notation.
n_by_pickup_percent = data.groupby("pickup_hour")["tip_percent"].mean()
ax = n_by_pickup_percent.sort_values(ascending = False).plot.bar()
plt.title('Tip Percent Average per hour')
plt.xlabel('Pickup Hour')
plt.ylabel('Tip Percent')
plt.show()

# Average tip percent for the top 40 dropoff locations
plt.ticklabel_format(style='plain')    # to prevent scientific notation.
n_by_pickup_percent = data.groupby("DOLocationID")["tip_percent"].mean().sort_values(ascending = False).plot.bar()
#n_by_pickup_percent = n_by_pickup_percent.sort_values(ascending = False).plot.bar()
plt.title('Tip Percent Average per Drop off Location')
plt.xlabel('DOLocationID')
plt.ylabel('Tip Percent')
plt.xlim((1,40))
plt.show()

plt.figure(figsize=(18,6))
sns.histplot(data=data, x='pickup_day', hue='pickup_day', palette="husl")
plt.title("Which days see the most customers?")
plt.xlabel('Days of the week')
plt.ylabel('Pickup count')
plt.show()

From the above visualization, we can conclude that Tuesdays and Wednesdays are the busiest days of the week. Taxis are used the least on Sundays.

As a Taxi Driver, I will know that I am more likely to get more customers on Tuesdays and Wednesdays.

plt.figure(figsize=(18,6))
sns.countplot(data=data, x='pickup_hour', palette="husl")
plt.title("Which hours of the day see the most customers?")
plt.xlabel('Hours of the day')
plt.ylabel('Pickup count')
plt.show()

From this visualization, we can dedude that afternoon and evening hours see more customers. The busiest hours of the day are between 10 am to 9 pm.

data.VendorID.unique()
array([1, 2, 6, 5])
plt.figure(figsize=(18,6))
sns.countplot(data=data, x='pickup_day', hue='VendorID', palette="husl")
plt.title("Which Vendor are preferred on a given day?")
plt.xlabel('Vendors')
plt.ylabel('Pickup count')
plt.show()

On most days, Vendor 2 followed by Vendor 1 is preferred by Customers.

data.payment_type.unique()
array([1, 2, 3, 4, 5, 0])
plt.figure(figsize=(18,6))
sns.countplot(data=data, x='pickup_day', hue='payment_type', palette="husl")
plt.title("Which payment methods are preferred on a given day?")
plt.xlabel('Payment Methods used during the trips')
plt.ylabel('Pickup count')
plt.show()

On all days of the week, payment type 1 is preferred. While payment type 4 and 5 are rarely used.

plt.figure(figsize=(20,6))
sns.lineplot(data=data, x="pickup_hour", y="tip_amount", hue="pickup_day", style="pickup_day")


plt.title("Tip amount according to the hour and day of the week")
plt.xlabel('Hour of the day')
plt.ylabel('Tip Amount')
plt.legend(title = "Days of the Week")

plt.show()

From this visualization, we can conclude that higher tips are given after midnight especially at 12-1 am and 4-6am.

plt.figure(figsize=(20,6))
sns.lineplot(data=data, x="pickup_hour", y="fare_amount", hue="pickup_day")
#plt.ylim([1,3,5,7,9,11,13,15,17,19,21,23])

plt.title("Fare amount according to the hour and day of the week")
plt.xlabel('Hour of the day')
plt.ylabel('Fare Amount')
plt.legend(title = "Days of the Week")
plt.show()

From this visualization, we dedude that fares are higher after midnight. I'm guessing that taxis have a higher rate for night time.

plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x="pickup_day", y="tip_amount", hue="pickup_day", palette="husl")
plt.ylim(0,200)

plt.title("Which days see the most tips?")
plt.xlabel('Days of the week')
plt.ylabel('Tip Amount')
plt.legend(title = "Days of the Week")
plt.show()

This visualization needs to be worked on further. I can't accurately tell which days receive the most tips from the customers.

plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x="pickup_day", y="trip_distance", hue="pickup_day", palette="husl")
plt.ylim(0,350)

plt.title("Which days see the most distance covered by taxis?")
plt.xlabel('Days of the week')
plt.ylabel('Trip Distance')
plt.legend(title = "Days of the Week")

plt.show()

Similarly, I can't accurately tell whichs cover the most distance. Need to be worked upon.

import ipywidgets as widgets

criteria_rb = widgets.RadioButtons(
    options=['extra', 'mta_tax', 'tolls_amount', 'airport_fee'],
    description='Other fares:',
    disabled=False)


def plot_extra(criteria):
    plt.figure(figsize=(12, 6))
    sns.barplot(x= 'pickup_day', y=criteria, data=data, palette="husl")
    plt.title("Other fares that are charged across the week")

widgets.interact(plot_extra, criteria = criteria_rb)

There are some extra charges on top of the fare amount, such as - mta tax, tolls amount, airport fee etc. I wanted to visualize - which of these extra fees are charged more or less given a particular day of the week.

fig = px.scatter(
    df,
    x = 'fare_amount',
    y = 'tip_amount',

)
fig.show()
fig_1 = px.scatter(
    df,
    x = 'fare_amount',
    y = 'tip_amount',
    color = 'payment_type'

)
fig_1.show()